We cannot deny the fact that philosophy is hard to read and understand, especially for someone who studies mathematics, like me. But it is still worthwhile to learn about philosophy, because every time we think, we act as philosophers. So I will let the machine do the tedious work for me, and at the same time try to capture some numerical features of the philosophy data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
The dataset contains around 360,000 rows, including both strings and numbers. The basic idea is to apply numerical methods to the numeric columns. For the text columns, computing word frequencies and applying sentiment analysis should work.
# Load the philosophy dataset and take a first look at its shape,
# column dtypes, and non-null counts.
df = pd.read_csv('philosophy_data.csv')
df.info()
# print(df)
Notice that the tokenized sentences — the underlying data for later use — are stored as string representations of lists. So we need to parse them back into actual lists of word tokens.
# Parse the stringified token lists in `tokenized_txt` back into real lists.
# NOTE: the original used eval(), which executes arbitrary code from the data
# file and was also called twice per row (once for the length, once for the
# column). ast.literal_eval parses Python literals only, and we parse once.
import ast

_parsed_tokens = df['tokenized_txt'].map(ast.literal_eval)
df['number_of_tokens'] = _parsed_tokens.map(len)
df['tokens_derived'] = _parsed_tokens
# df['tokens_derived'].iloc[0][0]
# Visualize the overall data grouped by the categorical features.
group_method_list = ['title', 'author', 'school']

# Tabular preview: row counts for every (school, author, title) combination.
counts_by_work = df.groupby(by=['school', 'author', 'title'])['title'].count()
print(pd.DataFrame(counts_by_work))

# One bar chart of value counts per grouping feature.
for feature in group_method_list:
    plt.figure(figsize=(16, 6))
    df[feature].value_counts().plot(kind='bar', color='g')
    plt.title(feature)
    plt.show()
For the numeric features of the data — sentence length and number of tokens — we can visualize their frequency distributions and see how they vary across schools.
# --- Numerical features: sentence length and number of tokens ---

def _hist_of(series, bins, title):
    # Frequency histogram of one numeric column.
    plt.figure(figsize=(16, 6))
    series.plot(kind='hist', bins=bins)
    plt.title(title)
    plt.show()

def _violin_of(category, value, width, title):
    # Distribution of `value` split by the categorical `category` column.
    plt.figure(figsize=(width, 6))
    sb.violinplot(x=category, y=value, data=df)
    plt.title(title)
    plt.grid()

# Sentence length: overall distribution, then per school and per author.
df.sentence_length.describe()
_hist_of(df.sentence_length, 300, 'Sentence_length_frequecy')

schools = df.school.unique().tolist()
_violin_of('school', 'sentence_length', 16, 'sentence length grouped by School')

authors = df.author.unique().tolist()
_violin_of('author', 'sentence_length', 30, 'sentence length grouped by author')

# Number of tokens: same sequence of views.
df.number_of_tokens.describe()
_hist_of(df.number_of_tokens, 150, 'number_of_tokens_frequency')

_violin_of('school', 'number_of_tokens', 16, 'number_of_tokens grouped by School')
_violin_of('author', 'number_of_tokens', 30, 'number_of_tokens grouped by author')

# Mean of the numeric columns for every (school, author) pair.
pd.DataFrame(df.groupby(by=['school', 'author']).mean())
After looking into the schools separately, the next idea is to examine how these schools are correlated. The first step is to eliminate duplicated tokens.
# Compare schools by vocabulary: collect each school's set of unique tokens,
# then measure pairwise overlap.
sum_of_tokens = []  # one unique-token set per school, in `schools` order
for sc in schools:
    df_temp = df[df.school == sc]
    print('School = ', sc.upper(), ':')
    # Flatten all token lists for this school into a single list.
    token_school = [tok for tokens in df_temp.tokens_derived.tolist()
                    for tok in tokens]
    # A set gives O(1) membership; the original scanned a list per token,
    # which is O(n^2) over hundreds of thousands of tokens.
    unique_tokens = set(token_school)
    print('total number of tokens is', len(token_school))
    print('total number of unique tokens is', len(unique_tokens))
    sum_of_tokens.append(unique_tokens)
# print(np.shape(sum_of_tokens))

# matrix[i][j] = fraction of school i's vocabulary shared with school j:
# |vocab_i ∩ vocab_j| / |vocab_i|.  Size derived from the data instead of
# the hard-coded 13 of the original.
n_schools = len(schools)
matrix = np.zeros(shape=(n_schools, n_schools))
for i in range(n_schools):
    for j in range(n_schools):
        shared = len(sum_of_tokens[i] & sum_of_tokens[j])
        matrix[i][j] = shared / len(sum_of_tokens[i])
print(matrix)

f, ax = plt.subplots(figsize=(12, 10))
ax = sb.heatmap(matrix.round(2), xticklabels=schools, yticklabels=schools,
                annot=True, cmap="YlGnBu")
plt.title('corelations between different schools')
After looking into the uniqueness of the tokens, the next question is how often tokens repeat within each school. Based on those frequencies, we can draw a word-cloud graph.
# One word cloud per school, built from that school's lowered sentences.
stopwords = set(STOPWORDS)
for sc in schools:
    school_rows = df[df.school == sc]
    print('School = ', sc.upper(), ':')
    # render wordcloud
    corpus = " ".join(school_rows.sentence_lowered)
    cloud = WordCloud(stopwords=stopwords, max_font_size=80, max_words=300,
                      width=600, height=400,
                      background_color="white").generate(corpus)
    plt.figure(figsize=(12, 8))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
Finally, after analyzing the surface features of the sentences, we can analyze their meaning using sentiment analysis. We can draw pie charts to see whether the attitude of each author and school is positive, negative, or neutral.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single shared analyzer, created lazily on first use: the original built a
# new SentimentIntensityAnalyzer (reloading the VADER lexicon) on EVERY call,
# which is very slow when classifying ~360k sentences.
_sentiment_analyzer = None

def SentimentAnlysis(sentence):
    """Classify `sentence` as "positive", "negative", or "neutral".

    Uses VADER's compound polarity score with the conventional
    +/-0.05 thresholds.
    """
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    compound = _sentiment_analyzer.polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"
def Analyzer(data, author):
    """Draw a pie chart of the sentiment distribution of one author's sentences.

    Parameters:
        data: the full DataFrame (must have 'author' and 'sentence_lowered').
        author: value matched against the 'author' column.

    Side effect: creates a matplotlib pie chart; returns None.
    """
    sentences = data[data['author'] == author]['sentence_lowered']
    # Tally the three labels directly while iterating the Series.  The
    # original also built an unused `corpus` string via quadratic
    # concatenation and indexed rows with range(len)/iloc — both removed.
    counts = {"positive": 0, "negative": 0, "neutral": 0}
    for sentence in sentences:
        counts[SentimentAnlysis(sentence)] += 1
    plt.figure(figsize=(7, 7))
    plt.pie([counts["positive"], counts["negative"], counts["neutral"]],
            labels=['positives', 'negatives', 'neutrals'],
            colors=['r', 'b', 'y'], autopct='%1.2f%%')
    plt.title('Sentiment Analysis for Philosopher: ' + author)
# One sentiment pie chart per philosopher.
for philosopher in authors:
    Analyzer(df, philosopher)
def Analyzer_1(data, school):
    """Draw a pie chart of the sentiment distribution of one school's sentences.

    Parameters:
        data: the full DataFrame (must have 'school' and 'sentence_lowered').
        school: value matched against the 'school' column.

    Side effect: creates a matplotlib pie chart; returns None.
    """
    sentences = data[data['school'] == school]['sentence_lowered']
    # Tally the three labels directly while iterating the Series.  The
    # original also built an unused `corpus` string via quadratic
    # concatenation and indexed rows with range(len)/iloc — both removed.
    counts = {"positive": 0, "negative": 0, "neutral": 0}
    for sentence in sentences:
        counts[SentimentAnlysis(sentence)] += 1
    plt.figure(figsize=(7, 7))
    plt.pie([counts["positive"], counts["negative"], counts["neutral"]],
            labels=['positives', 'negatives', 'neutrals'],
            colors=['r', 'b', 'y'], autopct='%1.2f%%')
    plt.title('Sentiment Analysis for School: ' + school)
# One sentiment pie chart per school.
for phil_school in schools:
    Analyzer_1(df, phil_school)